06 - Facies Classifier

George Crowther

This is an extension / amalgamation of prior entries. The workflow is broadly similar to those completed previously:

  • Load the data and map string features to integers
  • Cursory data examination; this workbook does not attempt to detail the full data analysis
  • Group the data by well and brute-force feature creation
    • Feature creation focuses on bringing results from adjacent samples into the features
    • Look at some ratios between features
  • Leaving out two wells at a time, use TPOT to generate a pipeline for prediction (a rough sketch of this step follows the list)
  • Take a modal vote over the fitted models' predictions on the test data set

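The TPOT search itself is not executed in this notebook; the pipelines hard-coded in the models list below are its exported results. As a rough, hypothetical sketch only (the raw log curves stand in for the engineered features, and generations / population_size are illustrative values, not the settings actually used), the search for each leave-two-wells-out split would look something like this:

import pandas as pd
from sklearn.model_selection import LeavePGroupsOut
from tpot import TPOTClassifier

# Illustrative only: raw log curves stand in for the engineered features built below
df = pd.read_csv('../training_data.csv').dropna()
log_columns = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
X = df[log_columns].values
y = df['Facies'].values
wells = df['Well Name'].values

lpgo = LeavePGroupsOut(n_groups=2)
for i, (train_idx, val_idx) in enumerate(lpgo.split(X, y, groups=wells)):
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
    tpot.fit(X[train_idx], y[train_idx])
    print('Split %d held-out score: %.3f' % (i, tpot.score(X[val_idx], y[val_idx])))
    tpot.export('tpot_pipeline_split_%d.py' % i)  # the exported code is what populates the models list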
In [1]:
import pandas as pd
import bokeh.plotting as bk
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from tpot import TPOTClassifier, TPOTRegressor

import sys
sys.path.append('~/home/slygeorge/Documents/Python/SEG ML Competition')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

bk.output_notebook()


/home/slygeorge/anaconda3/envs/seg_competition/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Loading BokehJS ...

In [98]:
from scipy.stats import mode
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFwe, SelectKBest, f_classif, SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, Binarizer, Normalizer, StandardScaler
from xgboost import XGBClassifier

In [115]:
# One pipeline per leave-two-wells-out split (16 in total), as generated and exported by TPOT
models = [
    make_pipeline(
    MinMaxScaler(),
    XGBClassifier(learning_rate=0.02, max_depth=5, min_child_weight=20, n_estimators=500, subsample=0.19)
),
    make_pipeline(
    make_union(VotingClassifier([("est", LogisticRegression(C=0.13, dual=False, penalty="l1"))]), FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    Binarizer(threshold=0.72),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    make_union(VotingClassifier([("est", GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, n_estimators=500))]), FunctionTransformer(lambda X: X)),
    BernoulliNB(alpha=28.0, binarize=0.85, fit_prior=True)
),
    make_pipeline(
    Normalizer(norm="l1"),
    make_union(VotingClassifier([("est", RandomForestClassifier(n_estimators=500))]), FunctionTransformer(lambda X: X)),
    SelectKBest(k=47, score_func=f_classif),
    SelectFwe(alpha=0.05, score_func=f_classif),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    make_union(VotingClassifier([("est", LinearSVC(C=0.26, dual=False, penalty="l2"))]), FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    Normalizer(norm="l2"),
    make_union(VotingClassifier([("est", ExtraTreesClassifier(criterion="entropy", max_features=0.3, n_estimators=500))]), FunctionTransformer(lambda X: X)),
    GaussianNB()
),
    make_pipeline(
    make_union(VotingClassifier([("est", BernoulliNB(alpha=49.0, binarize=0.06, fit_prior=True))]), FunctionTransformer(lambda X: X)),
    StandardScaler(),
    make_union(VotingClassifier([("est", GradientBoostingClassifier(learning_rate=0.87, max_features=0.87, n_estimators=500))]), FunctionTransformer(lambda X: X)),
    ExtraTreesClassifier(criterion="entropy", max_features=0.001, n_estimators=500)
),
    make_pipeline(
    make_union(VotingClassifier([("est", RandomForestClassifier(n_estimators=500))]), FunctionTransformer(lambda X: X)),
    BernoulliNB(alpha=1e-06, binarize=0.09, fit_prior=True)
),
    make_pipeline(
    Normalizer(norm="max"),
    MinMaxScaler(),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    SelectPercentile(percentile=18, score_func=f_classif),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    SelectKBest(k=50, score_func=f_classif),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    XGBClassifier(learning_rate=0.51, max_depth=10, min_child_weight=20, n_estimators=500, subsample=1.0)
),
    make_pipeline(
    make_union(VotingClassifier([("est", KNeighborsClassifier(n_neighbors=5, weights="uniform"))]), FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    StandardScaler(),
    SelectPercentile(percentile=19, score_func=f_classif),
    LinearSVC(C=0.02, dual=False, penalty="l1")
),
    make_pipeline(
    XGBClassifier(learning_rate=0.01, max_depth=10, min_child_weight=20, n_estimators=500, subsample=0.36)
)]

In [125]:
train_path = '../training_data.csv'
test_path = '../validation_data_nofacies.csv'

facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']

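# `resample_factors` and `div_columns` are referenced inside feature_extraction()
# but their definitions are not shown in this cell. `div_columns` is recovered from
# the ratio / above / below column names in the output; the window lengths in
# `resample_factors` are assumed values, not necessarily the originals.
resample_factors = [2, 5, 10, 25, 50]
div_columns = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']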

def feature_extraction(file_path, retain_class = True):

    # Read training data to dataframe
    test = pd.read_csv(file_path)
    
    if 'Facies' in test.columns:
        test.rename(columns={'Facies': 'class'}, inplace=True)

    # Set string features to integers

    for i, value in enumerate(test['Formation'].unique()):
        test.loc[test['Formation'] == value, 'Formation'] = i

    for i, value in enumerate(test['Well Name'].unique()):
        test.loc[test['Well Name'] == value, 'Well Name'] = i

    # Compute rolling-window statistics per well and interpolate the results; the
    # objective is to provide significantly more derived features for the classifier
    # and to capture more of the interdependency between adjacent samples.
    test['orig_index'] = test.index

    # Use rolling windows through upsampled frame, grouping by well name.

    # Empty list to hold frames
    mean_frames = []
    above = []
    below = []

    for well, group in test.groupby('Well Name'):
        # Empty list to hold rolling frames
        constructor_list = []
        for f in resample_factors:

            working_frame = group[['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M',
           'RELPOS']]

            mean_frame = working_frame.rolling(window = f, center = True).mean().interpolate(method = 'index', limit_direction = 'both', limit = None)
            mean_frame.columns = ['Mean_{0}_{1}'.format(f, column) for column in mean_frame.columns]
            max_frame = working_frame.rolling(window = f, center = True).max().interpolate(method = 'index', limit_direction = 'both', limit = None)
            max_frame.columns = ['Max_{0}_{1}'.format(f, column) for column in max_frame.columns]
            min_frame = working_frame.rolling(window = f, center = True).min().interpolate(method = 'index', limit_direction = 'both', limit = None)
            min_frame.columns = ['Min_{0}_{1}'.format(f, column) for column in min_frame.columns]
            std_frame = working_frame.rolling(window = f, center = True).std().interpolate(method = 'index', limit_direction = 'both', limit = None)
            std_frame.columns = ['Std_{0}_{1}'.format(f, column) for column in std_frame.columns]
            var_frame = working_frame.rolling(window = f, center = True).var().interpolate(method = 'index', limit_direction = 'both', limit = None)
            var_frame.columns = ['Var_{0}_{1}'.format(f, column) for column in var_frame.columns]
            diff_frame = working_frame.diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None)
            diff_frame.columns = ['Diff_{0}_{1}'.format(f, column) for column in diff_frame.columns]
            rdiff_frame = working_frame.sort_index(ascending = False).diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None).sort_index()
            rdiff_frame.columns = ['Rdiff_{0}_{1}'.format(f, column) for column in rdiff_frame.columns]
            skew_frame = working_frame.rolling(window = f, center = True).skew().interpolate(method = 'index', limit_direction = 'both', limit = None)
            skew_frame.columns = ['Skew_{0}_{1}'.format(f, column) for column in skew_frame.columns]

            # Note: skew_frame is computed above but is not included in this concatenation
            f_frame = pd.concat((mean_frame, max_frame, min_frame, std_frame, var_frame, diff_frame, rdiff_frame), axis = 1)

            constructor_list.append(f_frame)

        well_frame = pd.concat(constructor_list, axis = 1)
        well_frame['Well Name'] = well
        # orig index is holding the original index locations, to make extracting the results trivial
        well_frame['orig_index'] = group['orig_index']
        df = group.sort_values('Depth')
        # shift(-1) pulls values from the next (deeper) sample, shift(1) from the previous (shallower) one
        u = df.shift(-1).fillna(method = 'ffill')
        b = df.shift(1).fillna(method = 'bfill')
        above.append(u[div_columns])
        below.append(b[div_columns])

        mean_frames.append(well_frame.fillna(method = 'bfill').fillna(method = 'ffill'))

    frame = test
    frame.index = frame['orig_index']
    frame.drop(['orig_index', 'Well Name'], axis = 1, inplace = True)

    for f in mean_frames:
        f.index = f['orig_index']

    rolling_frame = pd.concat(mean_frames, axis = 0)
    above_frame = pd.concat(above)
    above_frame.columns = ['above_'+ column for column in above_frame.columns]
    below_frame = pd.concat(below)
    below_frame.columns = ['below_'+ column for column in below_frame.columns]
    upsampled_frame = pd.concat((frame, rolling_frame, above_frame, below_frame), axis = 1)

    features = [feature for feature in upsampled_frame.columns if 'class' not in feature]

    std_scaler = preprocessing.StandardScaler().fit(upsampled_frame[features])
    train_std = std_scaler.transform(upsampled_frame[features])

    train_std_frame = upsampled_frame
    for i, column in enumerate(features):
        train_std_frame.loc[:, column] = train_std[:, i]

    upsampled_frame_std = train_std_frame

    # Ratio features between the (already standardised) curves; column '{a}_{b}' holds b / a.
    # Note these are added after `features` is built, so they appear in the returned frame
    # but not in the feature list the models are fitted on.
    for feature in div_columns:
        for f in div_columns:
            if f == feature:
                continue
            upsampled_frame['{0}_{1}'.format(feature, f)] = upsampled_frame[f] / upsampled_frame[feature]
 
    return upsampled_frame_std, features

train_data_set, features = feature_extraction(train_path)
test_data_set, test_features = feature_extraction(test_path)

In [126]:
train_data_set.head()


Out[126]:
class Formation Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS ... NM_M_PHIND NM_M_PE NM_M_RELPOS RELPOS_Depth RELPOS_GR RELPOS_ILD_log10 RELPOS_DeltaPHI RELPOS_PHIND RELPOS_PE RELPOS_NM_M
0 3 -1.346013 -0.632316 0.366749 0.088008 1.212737 -0.203723 0.976532 -0.996911 1.672943 ... 0.204354 -0.979558 -1.678127 -0.377966 0.219224 0.052606 0.724912 -0.121775 0.583721 -0.595902
1 3 -1.346013 -0.628499 0.393005 0.075601 2.035209 -0.119283 0.418505 -0.996911 1.599708 ... 0.119652 -0.419802 -1.604665 -0.392884 0.245673 0.047259 1.272238 -0.074565 0.261613 -0.623183
2 3 -1.346013 -0.624682 0.418613 0.063194 2.149973 -0.056278 -0.139522 -0.996911 1.522985 ... 0.056452 0.139955 -1.527705 -0.410169 0.274863 0.041494 1.411683 -0.036952 -0.091611 -0.654577
3 3 -1.346013 -0.620865 0.647138 0.050788 1.977828 -0.047834 -0.251128 -0.996911 1.449750 ... 0.047982 0.251906 -1.454243 -0.428256 0.446379 0.035032 1.364254 -0.032994 -0.173221 -0.687643
4 3 -1.346013 -0.617047 0.273719 0.017704 1.901319 -0.023801 -0.362733 -0.996911 1.376515 ... 0.023875 0.363857 -1.380781 -0.448268 0.198849 0.012861 1.381255 -0.017291 -0.263516 -0.724228

5 rows × 364 columns


In [127]:
from sklearn.model_selection import LeavePGroupsOut

lpgo = LeavePGroupsOut(2)

split_list = []
fitted_models = []

for train, val in lpgo.split(train_data_set[features], 
                             train_data_set['class'], 
                             groups = train_data_set['Well Name']):
    # Keep only splits where every facies class appears in both the training and validation wells
    hist_tr = np.histogram(train_data_set.loc[train, 'class'],
                           bins = np.arange(len(facies_labels) + 1) + 0.5)
    hist_val = np.histogram(train_data_set.loc[val, 'class'],
                           bins = np.arange(len(facies_labels) + 1) + 0.5)
    if np.all(hist_tr[0] != 0) & np.all(hist_val[0] != 0):
        split_list.append({'train': train, 'val': val})
        
# 'Well Name' was label-encoded and then standardised inside feature_extraction(),
# so the groups print below as scaled floats rather than the original well names.
for s, split in enumerate(split_list):
    print('Split %d' % s)
    print('    training:   %s' % (train_data_set['Well Name'].loc[split['train']].unique()))
    print('    validation: %s' % (train_data_set['Well Name'].loc[split['val']].unique()))


Split 0
    training:   [-0.53211804 -0.10387962  0.3243588   0.75259723  1.18083565  1.60907407]
    validation: [-1.38859488 -0.96035646]
Split 1
    training:   [-0.96035646 -0.53211804  0.3243588   0.75259723  1.18083565  1.60907407]
    validation: [-1.38859488 -0.10387962]
Split 2
    training:   [-0.96035646 -0.53211804 -0.10387962  0.75259723  1.18083565  1.60907407]
    validation: [-1.38859488  0.3243588 ]
Split 3
    training:   [-0.96035646 -0.53211804 -0.10387962  0.3243588   0.75259723  1.18083565]
    validation: [-1.38859488  1.60907407]
Split 4
    training:   [-1.38859488 -0.53211804 -0.10387962  0.3243588   1.18083565  1.60907407]
    validation: [-0.96035646  0.75259723]
Split 5
    training:   [-1.38859488 -0.53211804 -0.10387962  0.3243588   0.75259723  1.60907407]
    validation: [-0.96035646  1.18083565]
Split 6
    training:   [-1.38859488 -0.53211804 -0.10387962  0.3243588   0.75259723  1.18083565]
    validation: [-0.96035646  1.60907407]
Split 7
    training:   [-1.38859488 -0.96035646 -0.10387962  0.3243588   0.75259723  1.18083565]
    validation: [-0.53211804  1.60907407]
Split 8
    training:   [-1.38859488 -0.96035646 -0.53211804  0.3243588   1.18083565  1.60907407]
    validation: [-0.10387962  0.75259723]
Split 9
    training:   [-1.38859488 -0.96035646 -0.53211804  0.3243588   0.75259723  1.60907407]
    validation: [-0.10387962  1.18083565]
Split 10
    training:   [-1.38859488 -0.96035646 -0.53211804  0.3243588   0.75259723  1.18083565]
    validation: [-0.10387962  1.60907407]
Split 11
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  1.18083565  1.60907407]
    validation: [ 0.3243588   0.75259723]
Split 12
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  0.75259723  1.60907407]
    validation: [ 0.3243588   1.18083565]
Split 13
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  0.75259723  1.18083565]
    validation: [ 0.3243588   1.60907407]
Split 14
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  0.3243588   1.18083565]
    validation: [ 0.75259723  1.60907407]
Split 15
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  0.3243588   0.75259723]
    validation: [ 1.18083565  1.60907407]

In [135]:
fitted_models = []
r = []

for i, split in enumerate(split_list):

    # Select training and validation data from current split (the validation slice
    # is not scored in this cell; a scoring sketch follows the cell output below)
    X_tr = train_data_set.loc[split['train'], features]
    X_v = train_data_set.loc[split['val'], features]
    y_tr = train_data_set.loc[split['train'], 'class']
    y_v = train_data_set.loc[split['val'], 'class']

    # Fit model from split
    fitted_models.append(models[i].fit(X_tr, y_tr))
    
    # Predict for model
    r.append(fitted_models[-1].predict(test_data_set[test_features]))
    
# Per-sample majority (modal) vote across the fitted models' predictions
results = mode(np.vstack(r))[0][0]

test_data_set['Facies'] = results


/home/slygeorge/anaconda3/envs/seg_competition/lib/python3.5/site-packages/sklearn/feature_selection/univariate_selection.py:114: RuntimeWarning: divide by zero encountered in true_divide
  f = msb / msw

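The X_v / y_v validation slices selected in the cell above are never actually scored. A minimal sketch (not part of the original run) of how each fitted pipeline could be checked against the two wells it never saw, assuming scikit-learn's micro-averaged F1 as the metric:

from sklearn.metrics import f1_score

# Hypothetical per-split check: score each pipeline on its held-out wells
for i, split in enumerate(split_list):
    X_v = train_data_set.loc[split['val'], features]
    y_v = train_data_set.loc[split['val'], 'class']
    pred = fitted_models[i].predict(X_v)
    print('Model %2d held-out micro F1: %.3f' % (i, f1_score(y_v, pred, average='micro')))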
In [137]:
test_data_set.iloc[:, ::-1].head()


Out[137]:
Facies RELPOS_NM_M RELPOS_PE RELPOS_PHIND RELPOS_DeltaPHI RELPOS_ILD_log10 RELPOS_GR RELPOS_Depth NM_M_RELPOS NM_M_PE ... Mean_2_Depth RELPOS NM_M PE PHIND DeltaPHI ILD_log10 GR Depth Formation
orig_index
0 2 -0.884952 -0.059289 -0.118109 0.079374 -0.076787 0.191931 -1.156836 -1.130005 0.066997 ... -1.892984 1.640888 -1.452107 -0.097287 -0.193803 0.130243 -0.125999 0.314937 -1.898239 -1.351854
1 2 -0.928980 -0.308522 0.036349 0.678435 -0.180501 0.456716 -1.211001 -1.076450 0.332108 ... -1.892984 1.563120 -1.452107 -0.482257 0.056818 1.060476 -0.282144 0.713902 -1.892939 -1.351854
2 2 -0.977618 -0.611844 0.252408 1.281514 -0.234336 0.618819 -1.270837 -1.022894 0.625852 ... -1.887683 1.485351 -1.452107 -0.908803 0.374915 1.903499 -0.348072 0.919164 -1.887639 -1.351854
3 2 -1.034229 -0.742690 0.218967 1.376426 -0.181180 0.596973 -1.340651 -0.966904 0.718110 ... -1.882383 1.404048 -1.452107 -1.042773 0.307440 1.932569 -0.254385 0.838179 -1.882338 -1.351854
4 2 -1.094872 -0.736314 0.100984 1.281789 -0.074072 0.503166 -1.415266 -0.913348 0.672511 ... -1.877082 1.326280 -1.452107 -0.976558 0.133933 1.700011 -0.098240 0.667339 -1.877038 -1.351854

5 rows × 364 columns


In [139]:
test_data_set.iloc[:, ::-1].to_csv('06 - Combined Models.csv')
